from pathlib import Path

import jieba
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the training data; the columns "方案1", "方案2" and "labels" are read below.
df = pd.read_csv(r"C:\Users\Lenovo\Desktop\000\data\公众奖励模型训练数据.csv")

# Pick the preferred plan per row: "方案1" where labels == 1, otherwise "方案2".
# Series.where does the selection column-wise (no Python-level call per row)
# and still yields a Series when the frame has no rows.
df["best_plan"] = df["方案1"].where(df["labels"] == 1, df["方案2"])

# Built-in stop-word list (extend as needed).
stopwords = {"的", "了", "和", "是", "在", "与", "及", "对", "为", "有", "将", "以", "应"}

def tokenizer(text):
    """Segment *text* with jieba and keep only the informative tokens.

    A token is kept when it is longer than one character, is not made up
    solely of whitespace, and is not in the module-level ``stopwords`` set.

    Args:
        text: The string to segment; passed unchanged to ``jieba.lcut``.

    Returns:
        A list of the kept tokens, in the order jieba produced them.
    """
    # The whitespace check matters for multi-character blanks such as "\r\n":
    # they have length 2, so the length filter alone would let them through.
    return [
        w
        for w in jieba.lcut(text)
        if len(w) > 1 and not w.isspace() and w not in stopwords
    ]

# TF-IDF over the preferred plans, keeping at most 100 vocabulary terms.
# Rows without plan text are dropped first because the vectorizer rejects NaN
# documents; such rows could not add weight to any term anyway.
corpus = df["best_plan"].dropna().astype(str)
# token_pattern=None: the default pattern is unused once a custom tokenizer is
# supplied, and leaving it set only makes scikit-learn warn about that.
vectorizer = TfidfVectorizer(tokenizer=tokenizer, token_pattern=None, max_features=100)
X = vectorizer.fit_transform(corpus)
keywords = vectorizer.get_feature_names_out()
weights = X.toarray().sum(axis=0)  # global weight per term: TF-IDF summed over all documents

# Pack into a DataFrame, highest weight first.
kw_df = pd.DataFrame({"关键词": keywords, "权重": weights})
kw_df = kw_df.sort_values(by="权重", ascending=False)

# Export to Excel, creating the target folder first so that a missing
# directory does not abort the run after the analysis has finished.
output_path = r"C:\Users\Lenovo\Desktop\000\价值维度匹配分析\公众偏好关键词.xlsx"
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
kw_df.to_excel(output_path, index=False)

